#!/usr/bin/python 
from numpy import *

#Collaborators: Fred Landis, Alan Wu, Zen Yang

#Helper functions courtesy of Master McMillan
def GlobalAlign(v, w, match, mismatch, gap):
	s = zeros((len(v)+1,len(w)+1), dtype=int32)
	b = zeros((len(v)+1,len(w)+1), dtype=int32)

	for j in xrange(1,len(w)+1):
		s[0,j] = s[0,j-1] + gap
		b[0,j] = 1 

	for i in xrange(1,len(v)+1):
		s[i,0] = s[i-1,0] + gap
		b[i,0] = 2

	for i in xrange(1,len(v)+1):
		for j in xrange(1,len(w)+1):
			diag = s[i-1, j-1] + (match if (v[i-1] == w[j-1]) else mismatch)
			vskip = s[i-1, j] + gap
			wskip = s[i, j-1] + gap
			s[i,j] = max(vskip, wskip, diag)
			if (s[i,j] == wskip):
				b[i,j] = 1
			elif (s[i,j] == vskip):
				b[i,j] = 2
			else:
				b[i,j] = 3

	return s,b 
	#returns the score array and the backtrack array

def GlobalAlignForSubstring(v, w, match, mismatch, gap):
	s = zeros((len(v)+1,len(w)+1), dtype=int32)
	b = zeros((len(v)+1,len(w)+1), dtype=int32)

	#for j in xrange(1,len(w)+1):
	#	s[0,j] = s[0,j-1] + gap
	#	b[0,j] = 1 

	for i in xrange(1,len(v)+1):
		s[i,0] = s[i-1,0] + gap
		b[i,0] = 2

	for i in xrange(1,len(v)+1):
		for j in xrange(1,len(w)+1):
			diag = s[i-1, j-1] + (match if (v[i-1] == w[j-1]) else mismatch)
			vskip = s[i-1, j] + gap
			wskip = s[i, j-1] + gap
			s[i,j] = max(vskip, wskip, diag)
			if (s[i,j] == wskip):
				b[i,j] = 1
			elif (s[i,j] == vskip):
				b[i,j] = 2
			else:
				b[i,j] = 3

	return s,b 
	#returns the score array and the backtrack array

def alignV(b,v,i,j,local=False):
	if (b[i,j] == 0) and local:
		return ' '
	elif (i == 0):
		if (j == 0):
			return ' '
		else:
			return alignV(b,v,i,j-1,local) + '-'
	elif (b[i,j] == 3):
		return alignV(b,v,i-1,j-1,local) + v[i-1]
	elif (b[i,j] == 2):
		return alignV(b,v,i-1,j,local) + v[i-1]
	elif (b[i,j] == 1):
		return alignV(b,v,i,j-1,local) + '-'

def alignW(b,w,i,j,local=False):
	if (b[i,j] == 0) and local:
		return ' '
	elif (j == 0):
		if (i == 0):
			return ' '
		else:
			return alignW(b,v,i-1,j,local) + '-'
	elif (b[i,j] == 3):
		return alignW(b,w,i-1,j-1,local) + w[j-1]
	elif (b[i,j] == 2):
		return alignW(b,w,i-1,j,local) + '-'
	elif (b[i,j] == 1):
		return alignW(b,w,i,j-1,local) + w[j-1]

def alignWNonRecursive(b,w,i,j,local=False):
	while (b[i,j] != 0) and local:
		if (b[i,j] == 3):
			i = i-1
			j = j-1
		elif (b[i,j] == 2):
			i = i-1
		elif (b[i,j] == 1):
			j = j-1
	return j

#Problem 1

#It seems like this is just asking us to do Local Alignment between the fish and human sequence, and then the 
#difference from normal local alignment is that we're going to choose the alignment that starts in the first 
#colmun and ends in the last column (assuming human gene spans top row), because we need to make sure that the 
#resulting substring from the fish sequence actually has the start and end matching of the human sequence. 

with open("humanGene.seq", 'r') as sequenceFile:
	hGene = sequenceFile.read()
with open("mouseGene.seq", 'r') as sequenceFile:
	mGene = sequenceFile.read()
with open("yeastGene.seq", 'r') as sequenceFile:
	yGene = sequenceFile.read()
with open("zfishGene.seq", 'r') as sequenceFile:
	fGene = sequenceFile.read()


scores, backtrack = GlobalAlignForSubstring(hGene,fGene,1,0,-1)
#print scores
#print backtrack

#do argmax to get the index of the largest guy in the bottom row. Got 3038, which is what Fred got. 
#argmax(scores[len(hGene)]) #this gives us the end index we want for fish gene

#run nonRecursive to get the index of the start 
#print alignWNonRecursive(backtrack,fGene,len(hGene),3038,True) #this gives us the start index we want for the fish gene. 
#Got 317, which is what Fred got. 

#Problem 2 
#Could calculate hamming distances between all four strings, that is, tally up the numbers of bases that differ 
#from the consenusus, then find the longest consecutive sequence of those 4's (full agreement). 9 characters in common. 

def longestCommonSubstring(hGene,mGene,fGene,yGene):
    bestsofar=0
    answer = []
    for i in xrange(len(hGene)):
        for j in xrange(len(mGene)):
            if mGene[j:j+bestsofar+1] != hGene[i:i+bestsofar+1]: #pruning time!
                continue
            for k in xrange(len(fGene)):
                if mGene[k:k+bestsofar+1] != hGene[i:i+bestsofar+1]: #pruning time!
                    continue
                for l in xrange(len(yGene)):
                    if mGene[l:l+bestsofar+1] != hGene[i:i+bestsofar+1]: #pruning time!
                        continue
                    maxlen = min(len(hGene)-i,len(mGene)-j,len(fGene)-k,len(yGene)-1)
                    for m in xrange(bestsofar+1,maxlen): #pruning time!
                        if hGene[i+m] != mGene[j+m]:
                            break
                        if hGene[i+m] != fGene[k+m]:
                            break
                        if hGene[i+m] != yGene[l+m]:
                            break
                    if(m>bestsofar):
                        bestsofar=m
                        answer=[i,j,k,l]
    print answer

def get_all_substrings_return_dictionary_with_initial_count(string):
  length = len(string)
  adict = {}
  for i in xrange(length):
    for j in xrange(i,length):
      adict[string[i:j + 1]]=1 
  return adict

substringsOfYeast = get_all_substrings_return_dictionary_with_initial_count(yGene)
substringsOfHuman = get_all_substrings_return_dictionary_with_initial_count(hGene)
substringsOfMouse = get_all_substrings_return_dictionary_with_initial_count(mGene)
substringsOfFish = get_all_substrings_return_dictionary_with_initial_count(fGene)

#we save runtime if we only look at values that have been in all of them so far

substringsInYH = []
for substring in substringsOfYeast:
	if substring in substringsOfHuman: 
		substringsInYH.append(substring)
#print substringsInYH

substringsInYHM = []
for substring in substringsInYH:
	if substring in substringsOfMouse: 
		substringsInYHM.append(substring)
#print substringsInYHM

substringsInAll= []
for substring in substringsInYHM:
	if substring in substringsOfFish: 
		substringsInAll.append(substring)

print max(substringsInAll, key=len)

#Problem 3

#Take the output of problem 1 in, because that's our last normalized sequence. 
fGene = fGene[317:3038]

score, back = GlobalAlign(yGene, hGene,1,0,-1)
print "Yeast and Human: ",score[len(yGene)][len(hGene)]

score, back = GlobalAlign(yGene, mGene,1,0,-1)
print "Yeast and Mouse: ",score[len(yGene)][len(mGene)]

score, back = GlobalAlign(yGene, fGene,1,0,-1)
print "Yeast and Fish: ",score[len(yGene)][len(fGene)]

score, back = GlobalAlign(fGene, hGene,1,0,-1)
print "Fish and Human: ",score[len(fGene)][len(hGene)]

score, back = GlobalAlign(fGene, mGene,1,0,-1)
print "Fish and Mouse: ",score[len(fGene)][len(mGene)]

score, back = GlobalAlign(mGene, hGene,1,0,-1)
print "Mouse and Human: ",score[len(mGene)][len(hGene)]

